source('utils.R')
#devtools::install_github("ujjwalkarn/xda")
library(knitr)
library(ggjoy)
#devtools::install_github("vsimko/corrplot")
library(corrplot)
# First run feature-engineering chunk from preprocessing.Rmd
# Build the analysis data frame: load the wine data, blank out part of the pH
# column, and replace `total sulfur dioxide` with the non-free remainder
# (`other sulfur dioxide`).
# NOTE(review): the 22.5 passed to introduce_nas() looks like a percentage
# (the numeric summary reports ~22.49% of pH missing) -- confirm in utils.R.
df <- introduce_nas(load_data(), 22.5, 'pH') %>%
  mutate(
    `other sulfur dioxide` = `total sulfur dioxide` - `free sulfur dioxide`
  ) %>%
  select(-`total sulfur dioxide`)
Dataset description (UCI Machine Learning Repository): https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names
Input variables (based on physicochemical tests):

1. fixed acidity
2. volatile acidity
3. citric acid
4. residual sugar
5. chlorides
6. free sulfur dioxide
7. total sulfur dioxide (removed)
8. other sulfur dioxide (created as ‘total sulfur dioxide’ - ‘free sulfur dioxide’)
9. density
10. pH
11. sulphates
12. alcohol
13. wine_colour

Output variable (based on sensory data):

14. quality (score between 0 and 10)
The two datasets are related to red and white variants of the Portuguese “Vinho Verde” wine. For more details, consult: http://www.vinhoverde.pt/en/ or the reference [Cortez et al., 2009]. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.).
These datasets can be viewed as classification or regression tasks. The classes are ordered and not balanced (e.g. there are much more normal wines than excellent or poor ones). Outlier detection algorithms could be used to detect the few excellent or poor wines. Also, we are not sure if all input variables are relevant. So it could be interesting to test feature selection methods.
Number of Instances: red wine - 1599; white wine - 4898.
Number of Attributes: 12 + output attribute
Note: several of the attributes may be correlated, thus it makes sense to apply some sort of feature selection.
# Print the xda summary of the numeric columns: counts, moments, quantiles,
# and missing-value / outlier counts per variable.
xda::numSummary(df)
## n mean sd max min range nunique
## fixed acidity 6497 7.215 1.296 15.900 3.800 12.1000 106
## volatile acidity 6497 0.340 0.165 1.580 0.080 1.5000 187
## citric acid 6497 0.319 0.145 1.660 0.000 1.6600 89
## residual sugar 6497 5.443 4.758 65.800 0.600 65.2000 316
## chlorides 6497 0.056 0.035 0.611 0.009 0.6020 214
## free sulfur dioxide 6497 30.525 17.749 289.000 1.000 288.0000 135
## density 6497 0.995 0.003 1.039 0.987 0.0519 998
## pH 5036 3.220 0.161 4.010 2.720 1.2900 108
## sulphates 6497 0.531 0.149 2.000 0.220 1.7800 111
## alcohol 6497 10.492 1.193 14.900 8.000 6.9000 111
## quality 6497 5.818 0.873 9.000 3.000 6.0000 7
## other sulfur dioxide 6495 85.237 45.418 331.000 3.000 328.0000 251
## nzeros iqr lowerbound upperbound noutlier
## fixed acidity 0 1.30000 4.4500 9.650 357
## volatile acidity 0 0.17000 -0.0250 0.655 377
## citric acid 151 0.14000 0.0400 0.600 509
## residual sugar 0 6.30000 -7.6500 17.550 118
## chlorides 0 0.02700 -0.0025 0.106 286
## free sulfur dioxide 0 24.00000 -19.0000 77.000 62
## density 0 0.00465 0.9854 1.004 3
## pH 0 0.21000 2.7950 3.635 58
## sulphates 0 0.17000 0.1750 0.855 191
## alcohol 0 1.80000 6.8000 14.000 3
## quality 0 1.00000 3.5000 7.500 228
## other sulfur dioxide 0 61.00000 -36.5000 207.500 20
## kurtosis skewness mode miss miss% 1% 5%
## fixed acidity 5.054 1.722 6.800 0 0.0000 5.100 5.700
## volatile acidity 2.820 1.494 0.280 0 0.0000 0.120 0.160
## citric acid 2.393 0.472 0.300 0 0.0000 0.000 0.050
## residual sugar 4.353 1.435 2.000 0 0.0000 0.900 1.200
## chlorides 50.841 5.397 0.044 0 0.0000 0.021 0.028
## free sulfur dioxide 7.896 1.220 29.000 0 0.0000 4.000 6.000
## density 6.597 0.503 0.997 0 0.0000 0.989 0.990
## pH 0.348 0.378 NA 1461 22.4873 2.890 2.970
## sulphates 8.643 1.796 0.500 0 0.0000 0.300 0.350
## alcohol -0.533 0.565 9.500 0 0.0000 8.700 9.000
## quality 0.230 0.190 6.000 0 0.0000 4.000 5.000
## other sulfur dioxide -0.322 0.101 101.000 2 0.0308 6.000 10.000
## 25% 50% 75% 95% 99%
## fixed acidity 6.400 7.000 7.700 9.800 12.000
## volatile acidity 0.230 0.290 0.400 0.670 0.880
## citric acid 0.250 0.310 0.390 0.560 0.740
## residual sugar 1.800 3.000 8.100 15.000 18.200
## chlorides 0.038 0.047 0.065 0.102 0.186
## free sulfur dioxide 17.000 29.000 41.000 61.000 77.000
## density 0.992 0.995 0.997 0.999 1.001
## pH 3.110 3.210 3.320 3.500 3.640
## sulphates 0.430 0.510 0.600 0.790 0.990
## alcohol 9.500 10.300 11.300 12.700 13.400
## quality 5.000 6.000 6.000 7.000 8.000
## other sulfur dioxide 55.000 86.000 116.000 159.000 189.000
# Print the xda summary of the character columns (here only wine_colour):
# counts, missing values, and the most frequent levels.
xda::charSummary(df)
## n miss miss% unique top5levels:count
## wine_colour 6497 0 0 2 white:4898, red:1599
# Draw histograms of the variables (helper defined outside this file,
# presumably in utils.R). ggplot2 warns about the rows it drops for NAs.
plot_histograms(df)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 1461 rows containing non-finite values (stat_bin).
# Draw boxplots of the variables (helper defined outside this file,
# presumably in utils.R). ggplot2 warns about the rows it drops for NAs.
plot_boxplots(df)
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).
# Draw ridgeline ("joy") density plots, one per variable (helper defined
# outside this file, presumably in utils.R). The "Picking joint bandwidth"
# messages come from the density-ridges stat.
plot_joyplots(df)
## Picking joint bandwidth of 0.223
## Picking joint bandwidth of 0.0252
## Picking joint bandwidth of 0.0274
## Picking joint bandwidth of 0.471
## Picking joint bandwidth of 0.0024
## Picking joint bandwidth of 2.47
## Picking joint bandwidth of 4.93
## Warning: Removed 2 rows containing non-finite values (stat_density_ridges).
## Picking joint bandwidth of 0.000418
## Picking joint bandwidth of 0.0276
## Warning: Removed 1461 rows containing non-finite values
## (stat_density_ridges).
## Picking joint bandwidth of 0.0224
## Picking joint bandwidth of 0.211
## Picking joint bandwidth of 0.138
# Draw further exploratory plots (helper defined outside this file). The call
# also prints a run of NULL lines.
# NOTE(review): the NULLs look like return values printed as a side effect;
# confirm in the helper whether they can be suppressed.
other_plots(df)
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1461 rows containing non-finite values (stat_boxplot).
# Boxplots restricted to the red wines (helper defined outside this file;
# presumably one panel per variable grouped by quality -- confirm).
boxplots_quality_colour_wine(df, 'red')
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 363 rows containing non-finite values (stat_boxplot).
# Boxplots restricted to the white wines (helper defined outside this file;
# presumably one panel per variable grouped by quality -- confirm).
boxplots_quality_colour_wine(df, 'white')
## Warning: Removed 1098 rows containing non-finite values (stat_boxplot).
# Correlation matrix over both wine colours. Rows containing any NA are
# dropped first, and the non-numeric wine_colour column is excluded.
corrs <- df %>%
  drop_na() %>%
  select(-wine_colour) %>%
  cor()
corrs
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 0.2277 0.3282
## volatile acidity 0.2277 1.0000 -0.3769
## citric acid 0.3282 -0.3769 1.0000
## residual sugar -0.1062 -0.1896 0.1506
## chlorides 0.3158 0.4070 0.0181
## free sulfur dioxide -0.2847 -0.3530 0.1284
## density 0.4646 0.2828 0.1018
## pH -0.2583 0.2633 -0.3300
## sulphates 0.2944 0.2276 0.0434
## alcohol -0.1064 -0.0441 -0.0105
## quality -0.0855 -0.2668 0.0890
## other sulfur dioxide -0.3039 -0.3780 0.1847
## residual sugar chlorides free sulfur dioxide density
## fixed acidity -0.1062 0.3158 -0.2847 0.4646
## volatile acidity -0.1896 0.4070 -0.3530 0.2828
## citric acid 0.1506 0.0181 0.1284 0.1018
## residual sugar 1.0000 -0.1252 0.3948 0.5562
## chlorides -0.1252 1.0000 -0.2150 0.3825
## free sulfur dioxide 0.3948 -0.2150 1.0000 0.0205
## density 0.5562 0.3825 0.0205 1.0000
## pH -0.2642 0.0608 -0.1316 0.0104
## sulphates -0.1813 0.3706 -0.1847 0.2573
## alcohol -0.3571 -0.2649 -0.1695 -0.6857
## quality -0.0382 -0.2139 0.0604 -0.3112
## other sulfur dioxide 0.4544 -0.2845 0.4964 0.0273
## pH sulphates alcohol quality
## fixed acidity -0.2583 0.294395 -0.106399 -0.0855
## volatile acidity 0.2633 0.227578 -0.044143 -0.2668
## citric acid -0.3300 0.043369 -0.010538 0.0890
## residual sugar -0.2642 -0.181263 -0.357098 -0.0382
## chlorides 0.0608 0.370602 -0.264854 -0.2139
## free sulfur dioxide -0.1316 -0.184732 -0.169486 0.0604
## density 0.0104 0.257265 -0.685664 -0.3112
## pH 1.0000 0.201430 0.119000 0.0233
## sulphates 0.2014 1.000000 -0.000449 0.0397
## alcohol 0.1190 -0.000449 1.000000 0.4519
## quality 0.0233 0.039660 0.451941 1.0000
## other sulfur dioxide -0.2296 -0.260291 -0.253983 -0.0703
## other sulfur dioxide
## fixed acidity -0.3039
## volatile acidity -0.3780
## citric acid 0.1847
## residual sugar 0.4544
## chlorides -0.2845
## free sulfur dioxide 0.4964
## density 0.0273
## pH -0.2296
## sulphates -0.2603
## alcohol -0.2540
## quality -0.0703
## other sulfur dioxide 1.0000
# Correlogram for all wines: coefficients as numbers in the lower triangle,
# ellipses in the upper triangle.
corrplot.mixed(corrs, lower = "number", upper = "ellipse")
# Correlation matrix for the red wines only. Rows containing any NA are
# dropped before filtering, and wine_colour is excluded from the matrix.
corrs_red <- df %>%
  drop_na() %>%
  filter(wine_colour == 'red') %>%
  select(-wine_colour) %>%
  cor()
corrs_red
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 -0.25350 0.6785
## volatile acidity -0.2535 1.00000 -0.5502
## citric acid 0.6785 -0.55016 1.0000
## residual sugar 0.1307 0.00317 0.1428
## chlorides 0.0917 0.08131 0.1671
## free sulfur dioxide -0.1421 -0.01381 -0.0668
## density 0.6607 0.03383 0.3536
## pH -0.6923 0.24452 -0.5520
## sulphates 0.1759 -0.26871 0.2844
## alcohol -0.0679 -0.20490 0.1157
## quality 0.1322 -0.39549 0.2449
## other sulfur dioxide -0.0838 0.10177 0.0543
## residual sugar chlorides free sulfur dioxide density
## fixed acidity 0.13072 0.0917 -0.14215 0.66071
## volatile acidity 0.00317 0.0813 -0.01381 0.03383
## citric acid 0.14279 0.1671 -0.06681 0.35360
## residual sugar 1.00000 0.0766 0.22751 0.36876
## chlorides 0.07662 1.0000 -0.01434 0.19991
## free sulfur dioxide 0.22751 -0.0143 1.00000 0.00992
## density 0.36876 0.1999 0.00992 1.00000
## pH -0.09060 -0.2295 0.07305 -0.33358
## sulphates 0.01888 0.2948 0.06838 0.14063
## alcohol 0.03307 -0.2187 -0.07244 -0.51141
## quality 0.02677 -0.1248 -0.03584 -0.17299
## other sulfur dioxide 0.17310 0.0557 0.43171 0.09298
## pH sulphates alcohol quality
## fixed acidity -0.6923 0.1759 -0.0679 0.1322
## volatile acidity 0.2445 -0.2687 -0.2049 -0.3955
## citric acid -0.5520 0.2844 0.1157 0.2449
## residual sugar -0.0906 0.0189 0.0331 0.0268
## chlorides -0.2295 0.2948 -0.2187 -0.1248
## free sulfur dioxide 0.0730 0.0684 -0.0724 -0.0358
## density -0.3336 0.1406 -0.5114 -0.1730
## pH 1.0000 -0.1736 0.1903 -0.0778
## sulphates -0.1736 1.0000 0.1028 0.2696
## alcohol 0.1903 0.1028 1.0000 0.4768
## quality -0.0778 0.2696 0.4768 1.0000
## other sulfur dioxide -0.1148 0.0367 -0.2302 -0.1925
## other sulfur dioxide
## fixed acidity -0.0838
## volatile acidity 0.1018
## citric acid 0.0543
## residual sugar 0.1731
## chlorides 0.0557
## free sulfur dioxide 0.4317
## density 0.0930
## pH -0.1148
## sulphates 0.0367
## alcohol -0.2302
## quality -0.1925
## other sulfur dioxide 1.0000
# Correlogram for the red wines: coefficients as numbers in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(corrs_red, lower = "number", upper = "ellipse")
# Correlation matrix for the white wines only. Rows containing any NA are
# dropped before filtering, and wine_colour is excluded from the matrix.
corrs_white <- df %>%
  drop_na() %>%
  filter(wine_colour == 'white') %>%
  select(-wine_colour) %>%
  cor()
corrs_white
## fixed acidity volatile acidity citric acid
## fixed acidity 1.0000 -0.0285 0.2989
## volatile acidity -0.0285 1.0000 -0.1543
## citric acid 0.2989 -0.1543 1.0000
## residual sugar 0.0954 0.0747 0.1090
## chlorides 0.0338 0.0757 0.1198
## free sulfur dioxide -0.0579 -0.0985 0.0909
## density 0.2753 0.0355 0.1636
## pH -0.4338 -0.0250 -0.1652
## sulphates -0.0245 -0.0349 0.0576
## alcohol -0.1320 0.0652 -0.0797
## quality -0.1241 -0.1832 -0.0143
## other sulfur dioxide 0.1286 0.1583 0.0987
## residual sugar chlorides free sulfur dioxide density
## fixed acidity 0.0954 0.0338 -0.05788 0.2753
## volatile acidity 0.0747 0.0757 -0.09849 0.0355
## citric acid 0.1090 0.1198 0.09090 0.1636
## residual sugar 1.0000 0.1027 0.29086 0.8406
## chlorides 0.1027 1.0000 0.08545 0.2698
## free sulfur dioxide 0.2909 0.0854 1.00000 0.2819
## density 0.8406 0.2698 0.28193 1.0000
## pH -0.1938 -0.0983 0.01172 -0.0959
## sulphates -0.0267 0.0128 0.05330 0.0710
## alcohol -0.4484 -0.3663 -0.23854 -0.7744
## quality -0.1030 -0.2215 0.00725 -0.3100
## other sulfur dioxide 0.3442 0.1976 0.25333 0.4980
## pH sulphates alcohol quality
## fixed acidity -0.4338 -0.0245 -0.1320 -0.12405
## volatile acidity -0.0250 -0.0349 0.0652 -0.18318
## citric acid -0.1652 0.0576 -0.0797 -0.01428
## residual sugar -0.1938 -0.0267 -0.4484 -0.10295
## chlorides -0.0983 0.0128 -0.3663 -0.22152
## free sulfur dioxide 0.0117 0.0533 -0.2385 0.00725
## density -0.0959 0.0710 -0.7744 -0.30998
## pH 1.0000 0.1641 0.1251 0.11260
## sulphates 0.1641 1.0000 -0.0109 0.05610
## alcohol 0.1251 -0.0109 1.0000 0.44445
## quality 0.1126 0.0561 0.4444 1.00000
## other sulfur dioxide 0.0099 0.1414 -0.4182 -0.22366
## other sulfur dioxide
## fixed acidity 0.1286
## volatile acidity 0.1583
## citric acid 0.0987
## residual sugar 0.3442
## chlorides 0.1976
## free sulfur dioxide 0.2533
## density 0.4980
## pH 0.0099
## sulphates 0.1414
## alcohol -0.4182
## quality -0.2237
## other sulfur dioxide 1.0000
# Correlogram for the white wines: coefficients as numbers in the lower
# triangle, ellipses in the upper triangle.
corrplot.mixed(corrs_white, lower = "number", upper = "ellipse")